Loading Libraries and Reading Data

# Set seed for reproducibility
set.seed(123)

# Register Stadia Maps API key.
# SECURITY: prefer supplying the key via the STADIA_MAPS_API_KEY environment
# variable so the secret is not committed to version control; the inline key
# is kept only as a backward-compatible fallback and should be rotated.
api_key <- Sys.getenv("STADIA_MAPS_API_KEY",
                      unset = "45381642-ef1c-4009-90a4-e54942f742c7")
register_stadiamaps(key = api_key)

# Get the name of working directory in RProject
proj.path <- getwd()

# Read the raw hotspots data from the data folder in the RProject directory
hotspots_raw <- read_csv(file.path(proj.path, 'data', 'hotspots.csv'))
## Rows: 1781266 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): source, sensor, satellite, agency, fuel
## dbl  (33): lat, lon, uid, temp, rh, ws, wd, pcp, ffmc, dmc, dc, isi, bui, fw...
## dttm  (1): rep_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Subset for period 2014-2023, add month and day columns

# Parse rep_date as POSIXct (read_csv already detected it as a datetime;
# the explicit format makes the expectation visible) and derive the year
# and month helper columns used throughout the analysis.
hotspots_raw <- hotspots_raw %>%
  mutate(
    rep_date = as.POSIXct(rep_date, format = "%Y-%m-%d %H:%M:%S"),
    year = year(rep_date),
    month = month(rep_date, label = TRUE, abbr = TRUE)
  )

# Restrict the data to the 2014-2023 study window
hotspots <- hotspots_raw %>%
  filter(between(year, 2014, 2023))

Visualizing and Focusing on Specific Time Periods

# Summarize the number of fire events per year and month
# This helps us understand the distribution of fire events over time
fire_events_per_month <- hotspots %>%
  group_by(year, month) %>%
  summarise(n_events = n(), .groups = 'drop')

# FIX: `fire_events_wide` was printed below but never created, so this chunk
# errored with "object 'fire_events_wide' not found". Reshape the monthly
# counts to wide format (one column per month); months with no recorded
# events are filled with 0, matching the printed table.
fire_events_wide <- fire_events_per_month %>%
  pivot_wider(names_from = month, values_from = n_events, values_fill = 0L)

# Print the wide format summary table to inspect the reshaped data
print(fire_events_wide)
## # A tibble: 10 × 13
##     year   Apr   May   Jun    Jul    Aug    Sep   Oct   Nov   Dec   Mar   Jan
##    <dbl> <int> <int> <int>  <int>  <int>  <int> <int> <int> <int> <int> <int>
##  1  2014    61   215   114  10474  20910   1996  2443  3952    78     0     0
##  2  2015   221  7840  4920  18147   4690    921  8061  3134     0   281     0
##  3  2016  1667  6453   137    191   1321   1492  1500  1814   269   101     0
##  4  2017    88   435   320  79606 152262  23580 13943   835     0    87     0
##  5  2018   244  1992  1477   5790 336899   9402  9795  2350     0    40    46
##  6  2019   310  1838   246    256   3754   3494 18946  4067     0   176     0
##  7  2020    51   209    40    112   3644   2429  5073   818   252    93     0
##  8  2021   719   283  2739 157537  64954    896  6423  3883   480   357   706
##  9  2022    96    50   144   2513   5967  24132 13781  4718   358   238   261
## 10  2023   528 43256 89281 150531 161000 237760  5999  6019  1197   192   439
## # ℹ 1 more variable: Feb <int>

Focus on the peak fire season from May to October.

Filter the data to include only these months.

# Keep only the peak fire season months (May through October)
hotspots_peak <- hotspots %>%
  filter(month(rep_date) %in% c(5, 6, 7, 8, 9, 10))

# FIX: `fire_events_subset_wide` was printed below but never created.
# Build the year-by-month count table for the peak season and reshape it
# to wide format, filling missing month/year combinations with 0.
fire_events_subset_wide <- hotspots_peak %>%
  group_by(year, month) %>%
  summarise(n_events = n(), .groups = 'drop') %>%
  pivot_wider(names_from = month, values_from = n_events, values_fill = 0L)

# Print the wide format summary table for the peak season
print(fire_events_subset_wide)
## # A tibble: 10 × 7
##     year   May   Jun    Jul    Aug    Sep   Oct
##    <dbl> <int> <int>  <int>  <int>  <int> <int>
##  1  2014   215   114  10474  20910   1996  2443
##  2  2015  7840  4920  18147   4690    921  8061
##  3  2016  6453   137    191   1321   1492  1500
##  4  2017   435   320  79606 152262  23580 13943
##  5  2018  1992  1477   5790 336899   9402  9795
##  6  2019  1838   246    256   3754   3494 18946
##  7  2020   209    40    112   3644   2429  5073
##  8  2021   283  2739 157537  64954    896  6423
##  9  2022    50   144   2513   5967  24132 13781
## 10  2023 43256 89281 150531 161000 237760  5999

Clustering using DBSCAN

Prepare the data for clustering (latitude, longitude, date)
Convert latitude and longitude from degrees to kilometers (1 degree ~ 111 km)
Calculate time in hours from the first recorded event
Note: time_hours is a component in clustering
to group together events occurring within a certain time frame (20 hours).

# Build the clustering feature matrix: position converted from degrees to
# kilometres (1 degree ~ 111 km) plus elapsed hours since the first event,
# so that space and time can share one DBSCAN distance threshold.
first_event <- min(hotspots_peak$rep_date)

event_data <- hotspots_peak %>%
  transmute(
    lat_km = lat * 111,
    lon_km = lon * 111,
    time_hours = as.numeric(difftime(rep_date, first_event, units = "hours"))
  )
# Run DBSCAN on `data` and report the resulting cluster and noise counts.
#
# Args:
#   data         data frame / matrix of numeric clustering features
#   eps_value    DBSCAN neighbourhood radius (same units as the features)
#   minPts_value minimum points required to form a dense region
# Returns: the number of clusters found, excluding noise.
apply_dbscan <- function(data, eps_value, minPts_value) {
  db <- dbscan(data, eps = eps_value, minPts = minPts_value)  # Apply DBSCAN
  # FIX: the previous count (`length(unique(cluster)) - 1`) was off by one
  # whenever DBSCAN produced no noise points, because there was no cluster
  # id 0 to subtract. Exclude id 0 explicitly instead.
  num_clusters <- length(setdiff(unique(db$cluster), 0))
  num_noise <- sum(db$cluster == 0)  # Cluster id 0 marks noise points
  cat("eps =", eps_value, ", minPts =", minPts_value,
      ": Number of clusters =", num_clusters,
      ", Number of noise points =", num_noise, "\n")
  return(num_clusters)
}

# Grid-search eps / minPts so the DBSCAN cluster count best matches the
# officially reported number of fires for the period.
desired_clusters <- 15095  # Official number of fires
best_eps <- NA             # Best eps found so far
best_minPts <- NA          # Best minPts found so far
best_diff <- Inf           # Smallest |clusters - desired| found so far

for (eps_val in seq(5, 15, by = 1)) {        # Candidate radii (km / hours)
  for (minPts_val in seq(1, 5, by = 1)) {    # Candidate density thresholds
    n_found <- apply_dbscan(event_data, eps_val, minPts_val)
    gap <- abs(n_found - desired_clusters)   # Avoid shadowing base::diff
    if (gap < best_diff) {
      best_diff <- gap
      best_eps <- eps_val
      best_minPts <- minPts_val
    }
  }
}

# Print out the best parameters found
cat("Best eps value:", best_eps, "\n")        # Best eps value: 11
cat("Best minPts value:", best_minPts, "\n")  # Best minPts value: 2

The best parameters found mean that fire events within an 11 km radius
are grouped into clusters,
and 2 events minimum form a cluster.

# Re-run DBSCAN with the tuned parameters and attach cluster labels
best_eps <- 11
best_minPts <- 2
best_db <- dbscan(event_data, eps = best_eps, minPts = best_minPts)

# Carry the cluster labels back onto the original peak-season rows
hotspots_peak$event_cluster <- best_db$cluster

# Drop noise points: cluster id 0 marks outliers belonging to no cluster
hotspots_peak <- filter(hotspots_peak, event_cluster != 0)

# Report how many distinct clusters remain
cat("Number of clusters identified by DBSCAN:",
    n_distinct(hotspots_peak$event_cluster), "\n")
## Number of clusters identified by DBSCAN: 15026
# Per-cluster summary: active time span, event count, centroid coordinates
cluster_summary <- hotspots_peak %>%
  group_by(event_cluster) %>%
  summarise(
    start_date = min(rep_date),   # Earliest report in the cluster
    end_date = max(rep_date),     # Latest report in the cluster
    num_events = n(),             # Number of hotspot detections
    latitude = mean(lat),         # Centroid latitude
    longitude = mean(lon)         # Centroid longitude
  )

# The five largest clusters by number of events
top_clusters <- cluster_summary %>%
  arrange(desc(num_events)) %>%
  head(5)

# Print the top clusters
print(top_clusters)
## # A tibble: 5 × 6
##   event_cluster start_date          end_date            num_events latitude
##           <int> <dttm>              <dttm>                   <int>    <dbl>
## 1          3930 2018-08-04 16:56:00 2018-08-22 20:06:00      62103     53.0
## 2          2408 2017-07-30 04:31:00 2017-08-13 02:04:00      61622     52.7
## 3         12440 2023-09-14 03:57:00 2023-09-24 04:08:00      55183     58.7
## 4          3928 2018-08-05 16:38:00 2018-08-22 20:06:00      30660     54.2
## 5          2388 2017-07-23 02:47:00 2017-08-13 05:00:00      30152     51.1
## # ℹ 1 more variable: longitude <dbl>

Visualize the clusters

# Plot the given fire clusters on a Stadia Maps / Stamen terrain basemap.
#
# Args:
#   cluster_ids  vector of event_cluster ids to display
#   data         hotspot data with lon, lat, and event_cluster columns
#   zoom_level   basemap tile zoom (default 10)
#   margin       padding in degrees added around the clusters' bounding box
#                (default 0.3, matching the previously hard-coded value)
plot_clusters_on_map <- function(cluster_ids, data, zoom_level = 10, margin = 0.3) {
  # Filter data for the specific clusters
  cluster_data <- data %>% filter(event_cluster %in% cluster_ids)

  # Guard: an empty selection would produce an invalid (Inf/-Inf) bbox
  if (nrow(cluster_data) == 0) {
    stop("No events found for the requested cluster ids", call. = FALSE)
  }

  # Bounding box around the clusters, padded by `margin` degrees
  bbox <- c(left = min(cluster_data$lon) - margin,
            bottom = min(cluster_data$lat) - margin,
            right = max(cluster_data$lon) + margin,
            top = max(cluster_data$lat) + margin)

  # Fetch the basemap tiles from the Stadia Maps API
  map <- get_stadiamap(bbox = bbox, zoom = zoom_level, maptype = "stamen_terrain")

  # Plot the cluster points over the basemap.
  # NOTE: the manual palette supports at most 7 distinct clusters per call;
  # ggplot errors if more cluster ids are requested.
  print(ggmap(map) +
          geom_point(data = cluster_data,
                     aes(x = lon, y = lat, color = factor(event_cluster)),
                     size = 2, alpha = 0.3, shape = 16) +
          labs(title = "Matched Fire Clusters",
               x = "Longitude",
               y = "Latitude",
               color = "Cluster ID") +
          theme_minimal() +
          theme(legend.position = "bottom") +
          scale_color_manual(values = c("firebrick1", "orange", "yellow",
                                        "orange3", "yellow2", "red3", "red4"))
  )
}

ID 3930 Tweedsmuir Complex Fire (2018)

ID 2408 Elephant Hill Fire (2017)

ID 12440 Fort Nelson Fire (2023)

ID 3928 Shovel Lake Fire (2018)

ID 2388 Hanceville-Riske Creek Fire (2017)

## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

The plots show large areas affected by fires.
Each fire cluster covers a big region, indicating major fire events.
Fire events within clusters are closely packed, showing high fire activity.
The fires follow the terrain, influenced by vegetation and topography.

Kelowna Fire Event Analysis

# Reference record for the McDougall Creek fire near Kelowna (2023)
kelowna_fire <- data.frame(
  fire_name = "McDougall Creek Fire",
  start_date = as.POSIXct("2023-08-16"),
  end_date = as.POSIXct("2023-08-30"),
  latitude = 49.8821,
  longitude = -119.4370
)

# Great-circle (haversine) distance in km from each cluster centroid
# to the known Kelowna fire location
cluster_summary <- cluster_summary %>%
  mutate(
    distance_to_kelowna_fire = distHaversine(
      cbind(longitude, latitude),
      cbind(kelowna_fire$longitude, kelowna_fire$latitude)
    ) / 1000  # distHaversine returns metres; convert to kilometres
  )

# Keep clusters within 50 km whose active period overlaps the known fire
# dates (standard interval-overlap test: starts before it ends AND ends
# after it starts).
kelowna_clusters <- cluster_summary %>%
  filter(
    distance_to_kelowna_fire < 50,
    start_date <= kelowna_fire$end_date,
    end_date >= kelowna_fire$start_date
  )

# Print the matching clusters
print(kelowna_clusters)
## # A tibble: 7 × 7
##   event_cluster start_date          end_date            num_events latitude
##           <int> <dttm>              <dttm>                   <int>    <dbl>
## 1         11964 2023-08-25 12:44:00 2023-08-29 17:49:00        982     50.0
## 2         11969 2023-08-16 16:54:00 2023-08-20 05:05:00       2770     49.9
## 3         13047 2023-08-20 16:28:00 2023-08-21 04:46:00        446     50.0
## 4         13124 2023-08-21 16:09:00 2023-08-22 04:01:00        252     49.9
## 5         13384 2023-08-23 01:51:00 2023-08-23 04:59:00         15     49.9
## 6         13402 2023-08-23 03:19:00 2023-08-23 04:08:00          4     50.0
## 7         13694 2023-08-25 01:36:00 2023-08-25 04:29:00          8     49.9
## # ℹ 2 more variables: longitude <dbl>, distance_to_kelowna_fire <dbl>
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

Variables

The dataset contains different variables
that are important to analyze fire events in British Columbia.
These variables can be broadly categorized into the following categories:
Location, Time, Data Collection Methods, Weather Data, Fire Indices, Other.

lat: Latitude of the fire event.
lon: Longitude of the fire event.

# Summary of Latitude and Longitude
# Observed extent of the fire events; compare against the province's
# official bounding box below as a sanity check on the data.
lat_range <- range(hotspots$lat, na.rm = TRUE)
lon_range <- range(hotspots$lon, na.rm = TRUE)

# British Columbia coordinates:
# Latitude: Between 48.309789°N and 60.00065°N
# Longitude: Between -139.058200 °W and -114.05423°W

# Both ranges fall inside the provincial bounds (printed output below)
cat("Latitude Range:", lat_range, "\n")
## Longitude Range: -138.328 -114.09

rep_date: The date and time when the fire event was reported.
month and year are new variables added

Full Dataset Analysis

# Per-year reporting span: first and last report dates and the span length
# in days (summarise can reference columns created earlier in the call)
hotspots_df_summary <- hotspots %>%
  group_by(year) %>%
  summarise(
    start_date = min(rep_date),
    end_date = max(rep_date),
    hotspots_df_day_count = as.numeric(difftime(end_date, start_date,
                                                units = "days"))
  )

print(hotspots_df_summary)
## # A tibble: 10 × 4
##     year start_date          end_date            hotspots_df_day_count
##    <dbl> <dttm>              <dttm>                              <dbl>
##  1  2014 2014-04-04 18:50:00 2014-12-07 20:37:00                  247.
##  2  2015 2015-03-07 06:05:00 2015-11-22 21:23:00                  261.
##  3  2016 2016-03-25 04:10:00 2016-12-21 04:55:00                  271.
##  4  2017 2017-03-03 04:15:00 2017-11-15 11:41:00                  257.
##  5  2018 2018-01-06 03:11:00 2018-11-07 18:19:00                  306.
##  6  2019 2019-03-13 02:06:00 2019-11-19 18:54:00                  252.
##  7  2020 2020-03-03 03:45:00 2020-12-29 05:05:00                  301.
##  8  2021 2021-01-06 13:35:00 2021-12-11 18:36:00                  339.
##  9  2022 2022-01-11 13:22:00 2022-12-31 19:06:00                  354.
## 10  2023 2023-01-01 05:01:00 2023-12-31 19:14:00                  365.

This barplot shows how many days each year had recorded fire events.
It also shows that the reporting period has grown over the years.

Peak Season Analysis

# Peak-season (May-October) per-year summary: reporting span in days plus
# the number of distinct DBSCAN clusters detected that year
hotspots_peak_df_summary <- hotspots_peak %>%
  group_by(year) %>%
  summarise(
    start_date = min(rep_date),
    end_date = max(rep_date),
    hotspots_peak_day_count = as.numeric(difftime(end_date, start_date,
                                                  units = "days")),
    num_clusters = n_distinct(event_cluster)
  )

print(hotspots_peak_df_summary)
## # A tibble: 10 × 5
##     year start_date          end_date            hotspots_peak_day_count
##    <dbl> <dttm>              <dttm>                                <dbl>
##  1  2014 2014-05-01 18:35:00 2014-10-31 23:44:00                    183.
##  2  2015 2015-05-03 05:17:00 2015-10-31 22:23:00                    182.
##  3  2016 2016-05-01 04:44:00 2016-10-31 06:30:00                    183.
##  4  2017 2017-05-02 19:49:00 2017-10-31 21:27:00                    182.
##  5  2018 2018-05-01 16:41:00 2018-10-31 04:40:00                    182.
##  6  2019 2019-05-01 03:47:00 2019-10-31 18:48:00                    184.
##  7  2020 2020-05-03 03:48:00 2020-10-31 16:07:00                    182.
##  8  2021 2021-05-05 02:24:00 2021-10-31 18:47:00                    180.
##  9  2022 2022-05-04 01:49:00 2022-10-31 18:02:00                    181.
## 10  2023 2023-05-01 01:09:00 2023-10-31 18:57:00                    184.
## # ℹ 1 more variable: num_clusters <int>

The table shows the earliest and latest dates of fire event reporting,
the total number of days with recorded fire events,
and the number of unique fire clusters detected during the peak season for each year.

The barplot shows the number of unique fire clusters
detected each year during the peak fire season.
This number fluctuates each year,
with noticeable peaks in 2018, 2021, and 2023.

Data Collection Methods

Different sources, satellites, and sensors are used for data collection.

# Number of fire events recorded by each data source
source_summary <- hotspots %>%
  count(source, name = "num_events")

print(source_summary)
## # A tibble: 15 × 2
##    source   num_events
##    <chr>         <int>
##  1 HMS            4158
##  2 NASA         232234
##  3 NASA1          5013
##  4 NASA2        339423
##  5 NASA3        330797
##  6 NASA6          5091
##  7 NASA7          5410
##  8 NASA_can      12899
##  9 NASA_usa        435
## 10 NASAkmz          89
## 11 NASAwcan      49947
## 12 NASAwusa        143
## 13 NOAA         172479
## 14 UMD           28514
## 15 USFS         589933

# Number of fire events recorded by each satellite
satellite_summary <- hotspots %>%
  count(satellite, name = "num_events")

print(satellite_summary)
## # A tibble: 15 × 2
##    satellite num_events
##    <chr>          <int>
##  1 Aqua           80449
##  2 JPSS1           7152
##  3 L8              1297
##  4 Landsat-8         16
##  5 METOP-A        49806
##  6 METOP-B        40999
##  7 N                767
##  8 NOAA-15         9296
##  9 NOAA-18        28695
## 10 NOAA-19        35724
## 11 NOAA-20       432901
## 12 NPP             1201
## 13 S-NPP         927884
## 14 Terra         110268
## 15 <NA>           50110

# Number of fire events recorded by each sensor
sensor_summary <- hotspots %>%
  count(sensor, name = "num_events")

# Advanced Very High Resolution Radiometer (AVHRR) imagery, courtesy of the NOAA
# Moderate Resolution Imaging Spectroradiometer (MODIS) imagery, courtesy of the NASA
# Visible Infrared Imaging Radiometer Suite (VIIRS) imagery, courtesy of NASA LANCE FIRMS

print(sensor_summary)
## # A tibble: 8 × 2
##   sensor  num_events
##   <chr>        <int>
## 1 AVHRR       164520
## 2 IBAND       400710
## 3 Landsat       1014
## 4 MODIS       240827
## 5 OLI            299
## 6 VIIRS        94345
## 7 VIIRS-I     869363
## 8 VIIRS-M       5487

Example of a Specific Fire Event: Kelowna Fire Clusters (McDougall Creek Fire, 2023)

# Restrict to events belonging to the clusters matched to the Kelowna fire.
# NOTE(review): this reuses the name `event_data` from the clustering step;
# a distinct name would be safer if the clustering features are needed later.
kelowna_cluster_ids <- kelowna_clusters$event_cluster
event_data <- hotspots_peak %>%
  filter(event_cluster %in% kelowna_cluster_ids)

# Tally the sources, satellites, and sensors behind this event
event_sources <- event_data %>%
  count(source, name = "num_events")

event_satellites <- event_data %>%
  count(satellite, name = "num_events")

event_sensors <- event_data %>%
  count(sensor, name = "num_events")

# Print summaries
print(event_sources)
## # A tibble: 7 × 2
##   source   num_events
##   <chr>         <int>
## 1 NASA2          1658
## 2 NASA3          1557
## 3 NASA6           460
## 4 NASA7           404
## 5 NASA_can          5
## 6 NASAwcan        366
## 7 NASAwusa         27
print(event_satellites)
## # A tibble: 5 × 2
##   satellite num_events
##   <chr>          <int>
## 1 Aqua               3
## 2 NOAA-20         1961
## 3 S-NPP           2118
## 4 Terra              2
## 5 <NA>             393
print(event_sensors)
## # A tibble: 2 × 2
##   sensor  num_events
##   <chr>        <int>
## 1 MODIS          398
## 2 VIIRS-I       4079

In the Kelowna fire clusters,
data came from various sources, satellites, and sensors.
Most data were provided by NASA sources and the VIIRS-I sensor,
mainly using the S-NPP satellite.
The VIIRS-I sensor and S-NPP satellite are the most used
both for this event and the overall data.

Weather Data Variables

Weather conditions are critical factors in the spread of fires.

temp: Temperature (°C)
rh: Relative Humidity (%)
ws: Wind Speed (km/h)
wd: Wind Direction (degrees)
pcp: Precipitation (mm)

# Describe each requested numeric column of `df`: count of missing values
# plus min / median / mean / max (rounded to 2 decimals), one row per column.
describe_numerical <- function(df, cols) {
  per_column <- lapply(cols, function(col) {
    values <- df[[col]]
    data.frame(
      Variable = col,
      Missing_Values = sum(is.na(values)),
      Min = round(min(values, na.rm = TRUE), 2),
      Median = round(median(values, na.rm = TRUE), 2),
      Mean = round(mean(values, na.rm = TRUE), 2),
      Max = round(max(values, na.rm = TRUE), 2)
    )
  })
  bind_rows(per_column)
}

# Summary of weather data variables
weather_vars <- c('temp', 'rh', 'ws', 'wd', 'pcp')
summary_hotspots <- describe_numerical(hotspots_peak, weather_vars)
print(summary_hotspots)
##   Variable Missing_Values   Min Median   Mean    Max
## 1     temp              0 -9.66  22.24  21.48  43.13
## 2       rh              0 11.00  33.00  35.48 100.00
## 3       ws              0  0.00   8.95   9.53  59.30
## 4       wd              0  0.00 213.00 202.25 360.00
## 5      pcp              0  0.00   0.00   0.23  78.59

There are no missing values in the weather data.
The minimum temperature recorded is -9.66°C, which is unusually low for fire events.
The maximum temperature recorded is 43.13°C, which is expected during peak fire seasons.
The relative humidity ranges from 11.00% to 100.00%, reflecting a wide variety of weather conditions.
Wind speeds range from 0.00 km/h to 59.3 km/h.
There is a full range of possible wind directions.
The precipitation data has a maximum value of 78.59 mm.

# Count subzero-temperature events by year and month, newest year first
subzero_temps_summary <- hotspots_peak %>%
  filter(temp < 0) %>%
  count(year, month, name = "count") %>%
  arrange(desc(year))

# Print the summary table
print(subzero_temps_summary)
## # A tibble: 11 × 3
##     year month count
##    <dbl> <ord> <int>
##  1  2023 Oct    1060
##  2  2022 Oct      82
##  3  2021 Oct     213
##  4  2020 Oct     666
##  5  2019 May       5
##  6  2019 Oct    1228
##  7  2018 Oct      22
##  8  2017 Oct      91
##  9  2016 Oct       2
## 10  2015 Oct     811
## 11  2014 Oct       9

The peak month for subzero temperature events is October,
showing that even colder conditions do not stop fires from happening.

Temperature (°C)

The median temperatures are between 20°C and 25°C.
2019 stands out with a slightly lower median temperature.
There are significant outliers present in all years,
particularly in 2014, 2017, 2018, 2021 and 2023.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

There is a clear seasonal pattern,
temperatures peak around July and August, then decrease towards October.
Some years, like 2014 and 2021, show higher peaks, meaning hotter summer months.

Relative Humidity (%)

Median relative humidity falls between 30% and 40%.
In 2019, median values are higher, indicating more humid season.
All years have significant outliers, especially 2014, 2017, and 2021, showing extremely humid days.

## `geom_smooth()` using formula = 'y ~ x'

Humidity values typically increase from August to October.
Notable peaks in summer are seen in 2016, 2019 and 2020.

Wind Speed (km/h)

The median wind speeds generally fall between 7 and 9 km/h.
There are significant outliers in all years, particularly noticeable in 2014, 2015, and 2018,
there were days with extremely high wind speeds.

## `geom_smooth()` using formula = 'y ~ x'

There is a clear seasonal pattern where wind speeds tend to be higher in the summer months and decrease towards October.
Some years, like 2015 and 2020, show more pronounced peaks in wind speed.
The overall trend remains consistent with seasonal variations.

Wind direction (degrees)

Visualize with Wind Rose to see most common wind direction using converted wind speed.

# Convert wind speed from km/h to m/s for the wind rose
hotspots_peak$ws_m_s <- hotspots_peak$ws / 3.6

# Wind rose showing the frequency of wind directions, shaded by speed
windRose(mydata = hotspots_peak, ws = "ws_m_s", wd = "wd",
         main = "Wind Rose", paddle = FALSE)

Most of the wind comes from the west and southwest.
Regions east and north of areas with strong western, southern, and southwestern winds
should be careful, as these winds can quickly spread fires.

Precipitation (mm)

The median values for precipitation are consistently very low, often close to zero.
Despite this, there are significant outliers in every year, particularly in 2017 and 2023.
These outliers indicate that certain days had much higher than average rainfall.

## `geom_smooth()` using formula = 'y ~ x'

There is a notable peak in July for several years,
with some years, like 2018, 2019 and 2020,
showing higher average precipitation during these months.

Outliers

Since precipitation often has many zero entries,
need to handle these outliers carefully to ensure they don’t skew our results.

# Remove rows whose `variable` value lies outside the Tukey 1.5 * IQR fences.
#
# Args:
#   data      data frame to filter
#   variable  column name (string) of the numeric variable to screen
# Returns: `data` with rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] removed.
# Note: rows with NA in `variable` are also dropped by the comparison.
remove_outliers <- function(data, variable) {
  q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  lower_bound <- q1 - 1.5 * iqr
  upper_bound <- q3 + 1.5 * iqr
  # FIX: use the `.data` pronoun instead of re-indexing the captured `data`
  # object inside filter(); the old form evaluates the full, unfiltered
  # column and silently misaligns rows if the input is grouped.
  data %>%
    filter(.data[[variable]] >= lower_bound & .data[[variable]] <= upper_bound)
}
# Strip outliers from temperature, relative humidity, and wind speed,
# applying the IQR filter to each variable in turn
hotspots_peak_clean <- remove_outliers(hotspots_peak, "temp")
hotspots_peak_clean <- remove_outliers(hotspots_peak_clean, "rh")
hotspots_peak_clean <- remove_outliers(hotspots_peak_clean, "ws")

# Check the summary statistics after removing outliers
summary_clean <- describe_numerical(hotspots_peak_clean,
                                    c('temp', 'rh', 'ws', 'wd', 'pcp'))
print(summary_clean)
##   Variable Missing_Values   Min Median   Mean    Max
## 1     temp              0  7.52  22.70  22.29  36.19
## 2       rh              0 11.00  32.00  33.51  59.00
## 3       ws              0  1.80   8.87   9.28  16.66
## 4       wd              0  0.00 212.00 202.28 360.00
## 5      pcp              0  0.00   0.00   0.14  46.46

The boxplots now show a more consistent distribution of temperatures across different years.
The majority of the years have median temperatures between 20°C and 25°C.
The extreme outliers have been reduced, providing a clearer view of the central temperature tendencies.

The median RH values generally remain between 30% and 40% across the years.
2019 still shows higher median RH, indicating more humid conditions.
Significant outliers have been reduced, giving a better understanding of the general RH trends.

Median wind speeds for each year are now clearly visible, generally falling between 7 and 10 km/h.
The removal of outliers helps to see typical wind speeds more accurately.
The boxplots show that some years, like 2020 and 2021, still have a wider range of wind speeds.

Fire Indices

These indices are derived from weather conditions and provide insights into fire behavior and potential risks.

ffmc: Fine Fuel Moisture Code
dmc: Duff Moisture Code
dc: Drought Code
isi: Initial Spread Index
bui: Buildup Index
fwi: Fire Weather Index

# Distribution summary of the six Canadian FWI-system fire indices
fire_index_vars <- c('ffmc', 'dmc', 'dc', 'isi', 'bui', 'fwi')
summary_fire_indices <- describe_numerical(hotspots_peak, fire_index_vars)
print(summary_fire_indices)
##   Variable Missing_Values Min Median   Mean     Max
## 1     ffmc              0 0.3  91.50  89.53   99.00
## 2      dmc              0 0.0  85.20  92.60  459.78
## 3       dc              0 0.0 531.77 518.81 1122.11
## 4      isi              0 0.0   8.62   8.50  137.70
## 5      bui              0 0.0 119.12 123.42  458.66
## 6      fwi              0 0.0  30.09  29.13  183.90

FFMC

A numeric rating of the moisture content of litter and other cured fine fuels.
This code is an indicator of the relative ease of ignition and the flammability of fine fuel.

The FFMC scale ranges from 0 to 101.
0-30: Very wet conditions; ignition is difficult.
30-70: Damp conditions; moderate difficulty for ignition.
70-85: Dry conditions; fuels are easily ignitable.
85-101: Very dry conditions; fuels are highly ignitable and fires can spread rapidly.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The values in the dataset show that during the peak season, the FFMC values are generally high, with a median around 91.50 and a maximum of 99.00. This suggests that the fine fuels are often in a dry state, making them highly ignitable and prone to rapid fire spread.

Most years have high FFMC values above 90. Only 2019 and 2020 show slightly lower values.

## `geom_smooth()` using formula = 'y ~ x'

The lineplot shows that FFMC values peak every year during the fire season.

Remove outliers

After removing outliers, the boxplots for FFMC show a more consistent distribution across the years.
The median values are mostly above 90.

Duff Moisture Code

A numeric rating of the average moisture content
of loosely compacted organic layers of moderate depth.
This code gives an indication of fuel consumption in moderate duff layers
and medium-size woody material.

It ranges typically from 0 to over 200, higher values indicate drier conditions and a higher fire risk.

0-20: Low fire danger, duff layers are wet, and ignition is unlikely. 21 - 40: Moderate fire danger, duff layers start drying. 41 - 100: High fire danger, the duff layer is dry, prone to ignition. 100 + : Extreme fire danger, the duff layer is very dry, and ignition is very likely with the potential for intense fires.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram shows that most DMC values range between 0 and 150, with a peak around 50-100.
Values above 200 are less common but indicate long dry periods.

The boxplots for each year show that 2017 and 2018 had particularly high outliers,
indicating extremely dry conditions.
2019 had lower DMC values, showing less severe drought.

## `geom_smooth()` using formula = 'y ~ x'

The lineplot indicates that DMC values peak in late summer (August) and decrease in autumn.

Remove outliers

After removing the outliers, the DMC values became more consistent across different years.
The median values now mostly fall between 50 and 100, providing a clearer picture of typical drought conditions.
While 2017 and 2021 still show higher values, the extremes have been reduced.

Drought Code

A numeric rating of the average moisture content of deep, compact organic layers.
This code is a useful indicator of seasonal drought effects on forest fuels and the amount of smoldering
in deep duff layers and large logs.

0-100: Indicates wet conditions with low fire potential.
100-300: Moderate drought conditions with increasing fire potential.
300-500: High drought conditions, leading to high fire risk.
500+: Indicates extreme drought, posing a very high fire risk and potential for intense, prolonged burning.

DC is important because it helps understand how dry the forest floor is
and the chance for deep-burning fires.
Higher DC values usually mean longer dry periods and can show how severe fire seasons might be.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram of DC values from 2014 to 2023 shows most values are between 300 and 700, peaking around 500.
This means BC often has conditions that can lead to deep-burning fires.

The boxplot of DC values across years show the variability and trends in drought conditions over time.
2017 and 2023 have higher median values, meaning very dry conditions.
2016 and 2019 show lower DC values, suggesting wetter conditions and potentially less severe fire activity.

## `geom_smooth()` using formula = 'y ~ x'

The line plot of DC values by month shows a clear seasonal trend,
DC values increase during the summer months (July to September) and decrease in the autumn.
This occurs at the time of the peak fire season.
There are a lot of high DC values, more than 500, it shows conditions for deep-burning fires.

Remove outliers

The median DC values are more consistent, generally falling between 250 and 550.
The range of the data has been reduced, with fewer extreme outliers.

Compare indices to the number of events

Create a combined plot from monthly average indices and fire events per month tables
to show the trends of the FFMC, DMC and DC and the number of fire occurrences over time.

# Join monthly average indices with monthly fire-event counts
monthly_data <- merge(monthly_avg, fire_events_per_month, by = c("year", "month"))

# Month must be numeric (not an ordered factor) to build a Date below
monthly_data$month <- as.numeric(monthly_data$month)

# First-of-month Date for plotting on a continuous time axis
monthly_data$Date <- as.Date(paste(monthly_data$year, monthly_data$month, "01",
                                   sep = "-"))

# Scale event counts to [0, 1] so they can share an axis with the indices
max_events <- max(monthly_data$n_events, na.rm = TRUE)
monthly_data$norm_n_events <- monthly_data$n_events / max_events

# Per-index maxima used for rescaling in the combined plot
max_ffmc <- max(monthly_data$avg_ffmc, na.rm = TRUE)
max_dmc <- max(monthly_data$avg_dmc, na.rm = TRUE)
max_dc <- max(monthly_data$avg_dc, na.rm = TRUE)

The plots show a clear relationship between the fire indices (FFMC, DMC, and DC) and the number of fires.
When the number of fires is high, the indices also show higher values.

It’s important to note that while high indices indicate conditions favorable for fires,
they alone do not cause fires.

Initial Spread Index

A numerical rating of the expected rate of fire spread
based on wind speed, temperature, and fine fuel moisture content.
ISI is crucial for understanding how quickly a fire can spread once it has ignited.

0-3: Low spread potential. Fires will spread slowly and are relatively easy to control.
4-7: Moderate spread potential. Fires spread more quickly and may require more effort to control.
8-12: High spread potential. Fires spread rapidly and can be difficult to control.
13-19: Very high spread potential. Fires spread very rapidly and are challenging to control.
20+: Extreme spread potential. Fires spread uncontrollably and can be extremely dangerous.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The boxplot shows variability in ISI values across years, with notable outliers in 2014.
The histogram indicates that most ISI values are low to moderate.

## `geom_smooth()` using formula = 'y ~ x'

ISI values tend to peak during the summer months, indicating higher fire spread potential during this period.

Analyses of the extreme outliers of 2014, values greater than 40

# Keep only observations with extreme ISI values (40 and above)
isi_outliers <- filter(hotspots_peak, isi >= 40)

# Dimensions of the outlier subset (rows, columns)
dim(isi_outliers)
## [1] 28 43
# Tally the outliers by fire-event cluster, most frequent first
isi_outliers %>%
  count(event_cluster) %>%
  arrange(desc(n))
## # A tibble: 5 × 2
##   event_cluster     n
##           <int> <int>
## 1           355    10
## 2           128     9
## 3            45     6
## 4           325     2
## 5           436     1
# Most isi outliers come from clusters 355 and 128
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

The data from July 2014, shows very high ISI values at a specific location in British Columbia.
This matches the date of the Mount McAllister fire, a large wildfire that burned over 20,000 hectares.

Remove outliers

The cleaned data shows a clearer view of typical ISI values.

Buildup Index

A numerical rating of the total amount of fuel available for combustion.
It is derived from the Duff Moisture Code (DMC) and the Drought Code (DC).

Low: 0-40
Moderate: 41-80
High: 81-120
Extreme: 121 and above

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

BUI values mostly range from 60 to 150.
This means the fire potential is moderate to high, with some extreme cases, for most of the dataset.

## `geom_smooth()` using formula = 'y ~ x'

BUI peaks in mid-summer, which matches the time when fire activity is usually the highest.

Remove outliers

After removing outliers the typical BUI values are between 50 and 150,
indicating moderate to high fire potential.
Outliers have been reduced, but 2017 and 2021 still show higher BUI values.

Fire Weather Index

A numeric rating of fire intensity. It is based on the ISI and the BUI,
and is used as a general index of fire danger throughout the forested areas of Canada.

Low (0-5): Minimal fire danger.
Moderate (6-12): Fires can start from most accidental causes, but the spread is slow.
High (13-20): Fires can start easily and spread rapidly.
Very High (21-30): Fires will start very easily, spread rapidly, and burn intensely.
Extreme (31+): Fires start and spread quickly, and are intense and challenging to control.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram shows that most FWI values are between 10 and 40, fire danger is from moderate to extreme.
Many values are above 30, severe fire weather conditions are frequent.

2014 had some extreme outliers.

## `geom_smooth()` using formula = 'y ~ x'

The lineplot shows that FWI peaks in mid-summer, peak fire activity period.

The FWI is calculated using the Initial Spread Index and the Buildup Index,
both of which take into account wind speed,
temperature, humidity, and fuel moisture.
This is why the FWI is a reliable indicator of overall fire danger.

Fire agencies use the FWI to inform the public, issue fire bans, and respond to fires.
It helps prioritize resources and actions to mitigate fire risks effectively.

Analyses of the extreme outliers of 2014, values greater than 75

# Keep only observations with extreme FWI values (75 and above)
fwi_outliers <- filter(hotspots_peak, fwi >= 75)

# Tally the outliers by fire-event cluster, most frequent first
fwi_outliers %>%
  count(event_cluster) %>%
  arrange(desc(n))
## # A tibble: 5 × 2
##   event_cluster     n
##           <int> <int>
## 1           355    10
## 2           128     9
## 3            45     6
## 4           325     2
## 5           436     1
# Most fwi outliers come from clusters 355 and 128, the same clusters as the isi outliers
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

The analysis of FWI extreme outliers in 2014, specifically values greater than 75,
reveals that these outliers are concentrated in the same event clusters as the ISI outliers
(clusters 355 and 128).
This corresponds to the Mount McAllister fire in British Columbia, which occurred in July 2014.

Remove outliers

After removing outliers, the FWI boxplots show more consistent distributions across the years.
Median FWI values generally fall between 20 and 40.

Summary table after removal of outliers

# Recompute the six fire-index summaries on the outlier-free dataset
fire_index_cols <- c('ffmc', 'dmc', 'dc', 'isi', 'bui', 'fwi')
summary_fire_indices_clean <- describe_numerical(hotspots_peak_clean, fire_index_cols)
print(summary_fire_indices_clean)
##   Variable Missing_Values   Min Median   Mean     Max
## 1     ffmc              0 55.37  91.64  90.85   99.00
## 2      dmc              0  0.00  87.19  95.75  459.78
## 3       dc              0  0.00 537.09 528.00 1122.11
## 4      isi              0  0.51   8.76   8.75   36.70
## 5      bui              0  0.00 120.77 127.70  458.66
## 6      fwi              0  2.89  30.55  30.17   56.96

Other Variables

These variables include additional factors that can influence fire events, such as fuel type and rate of spread.

fuel: Fuel Type
ros: Rate of Spread
hfi: Head Fire Intensity

Fuel Type

D1: Deciduous trees (leafless, early spring to fall)
C2, C3, C4, C5, C7: Various types of coniferous trees
O1, O1a, O1b: Grass or herbaceous vegetation
M1, M1M2, M2, M2_25, M2_35, M2_50, M2_65: Mixedwood or transitional vegetation
bog: Wetland areas with peatland vegetation
water: Water bodies
urban: Urban areas
non_fuel: Areas with no significant vegetation (rock, gravel)
low_veg: Areas with low vegetation (possibly recently burned or cleared)
farm: Agricultural areas
D2: Deadfall or downed wood

# Summary statistics for rate of spread (ros) and head fire intensity (hfi)
summary_other_variables <- describe_numerical(hotspots_peak, c('ros', 'hfi'))
summary_other_variables %>% print()
##   Variable Missing_Values Min  Median    Mean      Max
## 1      ros              0   0    4.46    6.69    96.23
## 2      hfi              0   0 3883.00 8776.72 93142.00

Rate of Spread

The predicted speed of the fire at the front or head of the fire (where the fire moves fastest).
It is measured in metres per minute and is based on the Fuel Type, Initial Spread Index, Buildup Index, and some others.
The scale ranges from 0 to over 96.23 m/min in the dataset.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The plot shows that the ROS can vary significantly year to year, with some years experiencing more extreme fire spread conditions.

Remove outliers

The median ROS values now range between 3 and 10 m/min. Extreme values above 25 m/min have been significantly reduced.

Head Fire Intensity

Measures the intensity or energy output of a fire at its front (head).
HFI is measured in kilowatts per metre (kW/m) of the fire front and is calculated based on the Rate of Spread (ROS) and the Total Fuel Consumption (TFC).

Low (0-500 kW/m): Fires are relatively easy to control and generally cause limited damage.
Moderate (500-2000 kW/m): Fires can be challenging to control, requiring more resources and effort.
High (2000-4000 kW/m): Fires are intense and difficult to manage, often requiring aerial firefighting resources.
Very High (4000-10,000 kW/m): Fires are extremely intense and nearly impossible to control, posing significant risk to life and property.
Extreme (10,000+ kW/m): Fires exhibit explosive behavior and can cause catastrophic damage.

Helps predict fire destructiveness, allocate resources, warn or evacuate public.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Most HFI values are concentrated at the lower end of the scale.
Very high HFI values are outliers; they reflect occasional extremely intense fires.
While most fires are less intense, the few high-intensity fires can be significant and impactful.

Boxplots show significant outliers in almost all years, there are some fires with extremely high intensities.

## `geom_smooth()` using formula = 'y ~ x'

Different years show a lot of differences in HFI values.
For example, 2014 and 2021 had higher peaks, meaning there was more intense fire activity in those years.
2020 had lower HFI values, suggesting milder fire conditions or better fire control efforts.

The plots also show how HFI can change from month to month within a single fire season.
Weather conditions, temperature, humidity, and wind speed, can affect how fires behave.

Compare HFI to the number of fire events.

# Reuse the previously normalized monthly_data table

# Largest monthly mean HFI, used to scale the HFI curve for plotting
max_hfi <- max(monthly_data[["avg_hfi"]], na.rm = TRUE)

The HFI index shows notable peaks, particularly in the years 2017, 2018, 2021 and 2023.

Analyses of the extreme outliers of 2014, 2018, 2021 and 2023, values greater than 75000

# Keep only observations with extreme HFI values (75000 kW/m and above)
hfi_outliers <- filter(hotspots_peak, hfi >= 75000)

# Tally the outliers by fire-event cluster, most frequent first
hfi_outliers %>%
  count(event_cluster) %>%
  arrange(desc(n))
## # A tibble: 17 × 2
##    event_cluster     n
##            <int> <int>
##  1          4094    78
##  2          3983    49
##  3         11877    41
##  4          4024    29
##  5          4310     9
##  6          4089     8
##  7          4357     7
##  8          4218     5
##  9          7478     5
## 10         11970     5
## 11          4493     4
## 12            45     2
## 13          3956     2
## 14          4096     2
## 15          4503     2
## 16          5299     2
## 17          4011     1
# Most hfi outliers come from clusters 4094, 3983, 11877 and 4024
# Inspect the four clusters that dominate the HFI outliers
cluster_summary %>%
  filter(event_cluster %in% c(3983, 4024, 4094, 11877))
## # A tibble: 4 × 7
##   event_cluster start_date          end_date            num_events latitude
##           <int> <dttm>              <dttm>                   <int>    <dbl>
## 1          3983 2018-08-11 11:29:00 2018-08-15 08:41:00        361     49.0
## 2          4024 2018-08-11 03:55:00 2018-08-14 03:22:38        313     49.5
## 3          4094 2018-07-18 15:42:00 2018-07-20 11:28:00        200     49.1
## 4         11877 2023-08-11 16:47:00 2023-08-20 05:05:00       6901     49.1
## # ℹ 2 more variables: longitude <dbl>, distance_to_kelowna_fire <dbl>

Snowy Mountain Fire and other fires in the Okanagan region in 2018.
Placer Mountain Fire area, a notable fire from 2018, which also burned in 2023.

## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.

Remove the outliers

The median HFI values are generally below 20,000 kW/m after removing outliers.

Tests and Models

In this section, we use statistical tests to identify meaningful patterns and build models to predict fire risk.

Subset the hotspots dataset to focus on specific numerical variables.

# Numeric weather and fire-behaviour variables to carry forward
numeric_columns <- c("temp", "rh", "ws", "pcp", "ffmc", "dmc",
                     "dc", "isi", "bui", "fwi", "ros", "hfi")

# Keep those measurements plus the time and cluster identifiers
hotspots_test <- hotspots_peak_clean %>%
  select(all_of(numeric_columns), year, month, rep_date, event_cluster)

str(hotspots_test)
## tibble [1,659,019 × 16] (S3: tbl_df/tbl/data.frame)
##  $ temp         : num [1:1659019] 21.2 26.8 27.8 23.4 24.6 28.8 25.5 17.2 28.2 26.1 ...
##  $ rh           : num [1:1659019] 32 27 25 43 27 30 29 36 23 21 ...
##  $ ws           : num [1:1659019] 12.4 4.4 11.1 6.5 6.7 10.6 7.6 13.6 9.7 9 ...
##  $ pcp          : num [1:1659019] 0 0 0 6.6 0 0 0 0.6 0 0 ...
##  $ ffmc         : num [1:1659019] 92.9 92.9 91.3 70.4 94.7 92.5 93.9 89 94.5 94.8 ...
##  $ dmc          : num [1:1659019] 114.9 89.1 83.3 70.4 63.8 ...
##  $ dc           : num [1:1659019] 550 503 632 382 255 ...
##  $ isi          : num [1:1659019] 12 8.1 9 0.9 11.7 10.5 10.9 7.3 13.2 13.2 ...
##  $ bui          : num [1:1659019] 150.9 123.5 125.3 96.4 78.5 ...
##  $ fwi          : num [1:1659019] 41.4 29.9 32.3 4.3 31.1 39 37.1 29.1 39.4 42 ...
##  $ ros          : num [1:1659019] 0.624 10.151 0.415 0.436 16.388 ...
##  $ hfi          : num [1:1659019] 263 13691 168 439 18430 ...
##  $ year         : num [1:1659019] 2014 2014 2014 2014 2014 ...
##  $ month        : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 8 8 8 8 7 8 8 8 8 7 ...
##  $ rep_date     : POSIXct[1:1659019], format: "2014-08-06 05:11:00" "2014-08-14 10:14:00" ...
##  $ event_cluster: int [1:1659019] 1 2 3 4 5 6 6 7 8 9 ...
# Fire indices to summarize
fire_indices <- c('ffmc', 'dmc', 'dc', 'isi', 'bui', 'fwi')

# Summary statistics of the fire indices in the test subset
summary_fire_indices <- describe_numerical(hotspots_test, fire_indices)
print(summary_fire_indices)
##   Variable Missing_Values Min Median   Mean     Max
## 1     ffmc              0 0.3  91.36  89.32   99.00
## 2      dmc              0 0.0  83.41  90.12  459.78
## 3       dc              0 0.0 530.41 516.47 1122.11
## 4      isi              0 0.0   8.44   8.24  137.70
## 5      bui              0 0.0 117.65 121.04  458.66
## 6      fwi              0 0.0  29.52  28.37  183.90

During EDA we looked into distributions of six key Fire Indices across different years.

The distributions of these indices for a particular year, 2018,

appeared to be close to the overall average values of these indices.

We also found out that 2018 had some significant fire events.

# Average number of observations per year (computed on the pre-cleaning data)
nrow(hotspots_peak) / length(unique(hotspots_peak$year))
## [1] 173024.2
# Average number of unique event_clusters per year
length(unique(hotspots_peak$event_cluster)) / length(unique(hotspots_peak$year))
## [1] 1502.6
# NOTE(review): the yearly averages above use `hotspots_peak` while the 2018
# counts below use `hotspots_peak_clean` — confirm the cross-dataset
# comparison is intended.
# Number of observations in 2018
nrow(filter(hotspots_peak_clean, year == 2018))
## [1] 339604
# Number of unique event clusters in 2018
length(unique(filter(hotspots_peak_clean, year == 2018)$event_cluster))
## [1] 1932

From these calculations, we found that 2018 had a significantly higher number of observations and event clusters compared to the average over the ten-year period.

Despite the higher number of fire events, the main indices (FFMC, DMC, DC, ISI, BUI, FWI) remained close to their median values across the dataset.

Therefore we wanted to test if the increased number of fires in 2018 was due to random chance or if other factors were at play.

We need to perform statistical tests to compare the indices for 2018 against the overall dataset.

A p-value greater than 0.05 would suggest that the higher number of fires could be due to chance,
while a lower p-value would point to the presence of other contributing factors.

Summary statistics for before T Test

# Summary statistics for the overall dataset
# NOTE(review): the overall summary uses `hotspots_peak` (outliers included)
# while the 2018 subset comes from `hotspots_test` (outlier-free) — verify
# that the baseline for the comparison is the intended one.
summary_overall <- describe_numerical(hotspots_peak, fire_indices)
summary_overall
##   Variable Missing_Values Min Median   Mean     Max
## 1     ffmc              0 0.3  91.50  89.53   99.00
## 2      dmc              0 0.0  85.20  92.60  459.78
## 3       dc              0 0.0 531.77 518.81 1122.11
## 4      isi              0 0.0   8.62   8.50  137.70
## 5      bui              0 0.0 119.12 123.42  458.66
## 6      fwi              0 0.0  30.09  29.13  183.90
# Filter data for the year 2018
hotspots_test_2018 <- hotspots_test %>% filter(year == 2018)

# Summary statistics for the year 2018
summary_2018 <- describe_numerical(hotspots_test_2018, fire_indices)
summary_2018
##   Variable Missing_Values   Min Median   Mean    Max
## 1     ffmc              0 11.38  91.27  90.07  99.00
## 2      dmc              0  0.00 100.13  97.82 343.14
## 3       dc              0  0.00 492.52 486.80 878.16
## 4      isi              0  0.00   8.22   7.90  24.81
## 5      bui              0  0.00 133.64 128.35 341.15
## 6      fwi              0  0.00  30.72  28.83  57.93

Check the histograms if they follow the normal distribution.

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##   Index Overall_Median Year_2018_Median Difference Percentage_Difference
## 1  ffmc          91.50            91.27      -0.23                 -0.25
## 2   dmc          85.20           100.13      14.93                 17.52
## 3    dc         531.77           492.52     -39.25                 -7.38
## 4   isi           8.62             8.22      -0.40                 -4.64
## 5   bui         119.12           133.64      14.52                 12.19
## 6   fwi          30.09            30.72       0.63                  2.09

Wilcoxon test

During our normality checks, some of the variables did not show a smooth bell curve, meaning that they do not follow a normal distribution. We decided to perform nonparametric tests, which do not assume a normal distribution of the data, to compare the results.

# Wilcoxon rank-sum test per fire index: overall cleaned values vs 2018 values
run_wilcoxon <- function(index) {
  wilcox.test(hotspots_peak_clean[[index]], hotspots_test_2018[[index]])
}
test_results_wilcoxon <- lapply(fire_indices, run_wilcoxon)
test_results_wilcoxon
## [[1]]
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 2.9981e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## [[2]]
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 2.2308e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## [[3]]
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 3.2781e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## [[4]]
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 3.0155e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## [[5]]
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 2.329e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 
## 
## [[6]]
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  hotspots_peak_clean[[index]] and hotspots_test_2018[[index]]
## W = 2.761e+11, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Print summary table for Wilcoxon test
summary_table_wilcoxon <- create_summary_table(test_results_wilcoxon)
print(summary_table_wilcoxon)
##   Index Overall_Mean Y_2018_Mean Overall_Median Y_2018_Median   P_Value
## 1  ffmc        89.53       90.07          91.50         91.27 <0.000001
## 2   dmc        92.60       97.82          85.20        100.13 <0.000001
## 3    dc       518.81      486.80         531.77        492.52 <0.000001
## 4   isi         8.50        7.90           8.62          8.22 <0.000001
## 5   bui       123.42      128.35         119.12        133.64 <0.000001
## 6   fwi        29.13       28.83          30.09         30.72 <0.000001
##   Significant
## 1         Yes
## 2         Yes
## 3         Yes
## 4         Yes
## 5         Yes
## 6         Yes

All six fire indices show significant differences between the values in 2018 and the overall values across all years.

This suggests that the conditions in 2018 were statistically different from the overall conditions in the dataset.

There were likely specific factors in 2018 that contributed to these differences.

While the Wilcoxon tests show that there are significant differences in the fire indices, they don’t prove that higher values of these indices cause more fires.

However, the higher values of indices like FWI, DMC, and others in 2018 suggest there is a link, meaning higher values of these indices likely mean conditions for more fires.

To quantify the relationship between these indices and the number of fires, we need further analysis, such as regression modeling.

Analysis of Two Similar Clusters

Further we wanted to test similar clusters.

We need to find two clusters of fire events with similar characteristics, mean indices, perform statistical tests to compare their indices, and visualize the results.

We selected clusters based on the similarity of their mean Fire Weather Index (FWI) and Drought Code (DC).

# Calculate summary statistics for each cluster
cluster_summary_indices <- hotspots_test %>%
  group_by(event_cluster) %>%
  summarise(
    num_events = n(),          # Number of hotspot records in the cluster
    mean_fwi = mean(fwi),  # Mean FWI value for each cluster
    mean_dc = mean(dc)     # Mean DC value for each cluster
  )

# Keep mid-sized clusters only (between 1000 and 6000 events)
chosen_clusters <- cluster_summary_indices %>%
  filter(num_events >= 1000 & num_events <= 6000)

# Score how close each cluster's mean FWI and DC are to the group-wide
# means of the supplied table; smaller `similar` = more similar cluster.
# Returns the input with fwi_diff, dc_diff and similar columns added,
# sorted by ascending similarity score.
#
# Fix: the original computed the reference means from the global
# `chosen_clusters` instead of the argument `df`, so the function silently
# depended on outside state. Results are unchanged for the usage in this
# report (it is only ever called on `chosen_clusters`).
calculate_similar <- function(df) {
  df %>%
    mutate(
      fwi_diff = abs(mean_fwi - mean(mean_fwi)),
      dc_diff = abs(mean_dc - mean(mean_dc)),
      similar = fwi_diff + dc_diff
    ) %>%
    arrange(similar)
}

# Score the clusters and keep the five with the closest combined mean FWI
# and DC (two of these five are selected for comparison below)
chosen_clusters <- chosen_clusters %>%
  calculate_similar() %>%
  slice(1:5)

# Chosen clusters
print(chosen_clusters)
## # A tibble: 5 × 7
##   event_cluster num_events mean_fwi mean_dc fwi_diff dc_diff similar
##           <int>      <int>    <dbl>   <dbl>    <dbl>   <dbl>   <dbl>
## 1          3953       3460     32.5    523.     2.67   0.192    2.86
## 2          4009       1217     31.7    520.     1.92   3.32     5.24
## 3          3949       1661     32.3    528.     2.45   3.95     6.40
## 4         13137       1122     27.2    520.     2.64   3.99     6.64
## 5          8019       3359     35.8    528.     6.00   3.98     9.98
# Chosen cluster IDs (cluster 4009 burned in 2018, cluster 13137 in 2023)
chosen_ids <- c(4009, 13137)

# Summary table for the chosen clusters
# NOTE(review): this reuses the name `cluster_summary`, overwriting the
# earlier cluster-level table used in the HFI-outlier section — consider a
# distinct name to avoid confusion.
cluster_summary <- hotspots_test %>%
  filter(event_cluster %in% chosen_ids) %>%
  group_by(event_cluster, year, month) %>%
  summarize(
    count = n(),
    mean_ffmc = mean(ffmc, na.rm = TRUE),
    mean_dmc = mean(dmc, na.rm = TRUE),
    mean_dc = mean(dc, na.rm = TRUE),
    mean_isi = mean(isi, na.rm = TRUE),
    mean_bui = mean(bui, na.rm = TRUE),
    mean_fwi = mean(fwi, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'event_cluster', 'year'. You can override
## using the `.groups` argument.
print(cluster_summary)
## # A tibble: 2 × 10
## # Groups:   event_cluster, year [2]
##   event_cluster  year month count mean_ffmc mean_dmc mean_dc mean_isi mean_bui
##           <int> <dbl> <ord> <int>     <dbl>    <dbl>   <dbl>    <dbl>    <dbl>
## 1          4009  2018 Aug    1217      90.3    108.     520.     8.27    142. 
## 2         13137  2023 Aug    1122      92.6     53.7    520.     9.10     85.1
## # ℹ 1 more variable: mean_fwi <dbl>

Plot the clusters on the map

## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
## ℹ 156 tiles needed, this may take a while (try a smaller zoom?)

Subset the dataframe for the chosen clusters

# Split the test data into one dataframe per chosen cluster
cluster_1 <- filter(hotspots_test, event_cluster == chosen_ids[1])
cluster_2 <- filter(hotspots_test, event_cluster == chosen_ids[2])

Visualize the distribution

# Two-sided Welch t-test comparing DC between the two chosen clusters
t_test_result <- t.test(cluster_1$dc, cluster_2$dc, alternative = "two.sided")

# Show the test statistic, p-value and confidence interval
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  cluster_1$dc and cluster_2$dc
## t = 0.40558, df = 2184.3, p-value = 0.6851
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.568158  3.907413
## sample estimates:
## mean of x mean of y 
##  520.3556  519.6860

The t-test shows no significant difference in the DC values between the two clusters (cluster_1 and cluster_2).

The p-value is greater than 0.05, and the confidence interval suggests that the observed differences could be due to random chance rather than a true difference in means.

Therefore, the DC values for these clusters are statistically similar.

Linear regression model

Analyze the weekly number of fires using various fire index predictors. The goal is to build a linear regression model to predict the number of fires and see its performance.

# Label each record with its year-week ("%U": week of year, Sunday-first)
hotspots_test$week <- format(hotspots_test$rep_date, "%Y-%U")

# Weekly fire counts plus the weekly mean of every numeric predictor.
# across() replaces the twelve hand-written mean() calls of the original,
# producing identical columns in identical order with less repetition.
weekly_summary <- hotspots_test %>%
  group_by(week) %>%
  summarise(
    fires = n(),
    across(
      all_of(c("hfi", "temp", "rh", "ws", "pcp", "ffmc",
               "dmc", "dc", "isi", "bui", "fwi", "ros")),
      ~ mean(.x, na.rm = TRUE)
    )
  )
weekly_summary
## # A tibble: 263 × 14
##    week    fires    hfi  temp    rh    ws      pcp  ffmc   dmc    dc   isi   bui
##    <chr>   <int>  <dbl> <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2014-17     6  5013.  19.5  24.3 10.1   0        91.6  21.4 124.   8.97  29.3
##  2 2014-18    57  1420.  10.1  23.4 12.9   7.02e-4  90.8  21.6  57.9 10.3   22.8
##  3 2014-19    98   914.  18.7  31.4  7.63  2.17e+0  83.9  27.3  69.5  6.39  28.7
##  4 2014-20     3  2907.  13.2  20   19.1   0        91.6  46.4 108   14.2   46.3
##  5 2014-21    21  2267.  18.9  37.0  6.64  3.95e-2  89.8  42.1 121.   6.6   45.6
##  6 2014-22    74 10103.  20.8  22.7  9.75  2.70e-4  93.1  68.3 160.  11.2   70.1
##  7 2014-23     8  7724   19.0  42.8 10.2   4.5 e+0  78.4  51.2 220.   6.78  63.0
##  8 2014-24     3     0   18    45    5.6   1.13e+1  52.8  16.8 313.   0.3   29.7
##  9 2014-25     7   799.  20.8  35.1  6.69  0        83.5  19.3 156.   2.57  29.4
## 10 2014-26     6  2390.  21.8  39.3 12.7   6.67e-1  88.6  49.1 283.   6.93  68.5
## # ℹ 253 more rows
## # ℹ 2 more variables: fwi <dbl>, ros <dbl>

Transform the number of fires to get more normally distributed data

# Best parameter Box-Cox 
boxcox_result <- MASS::boxcox(lm(fires ~ 1, data = weekly_summary))

# Lambda maximizing the profile log-likelihood on the search grid
lm_best <- boxcox_result$x[which.max(boxcox_result$y)]
lm_best
## [1] 0.02020202
# Transform with lm_best
# NOTE(review): lambda was estimated for `fires` but is applied to
# `fires + 1`; estimating on `fires + 1` would be fully consistent —
# confirm the shift is intended.
weekly_summary <- weekly_summary %>%
  mutate(boxcox_fires = ((fires + 1)^lm_best - 1) / lm_best)

# Model 1: Box-Cox fire counts regressed on all candidate fire indices
model_1 <- lm(boxcox_fires ~ dmc + dc + bui + fwi + hfi + ffmc + isi, data = weekly_summary)
summary(model_1)
## 
## Call:
## lm(formula = boxcox_fires ~ dmc + dc + bui + fwi + hfi + ffmc + 
##     isi, data = weekly_summary)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9510 -1.5283 -0.0813  1.6149  6.9366 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.989e+00  1.784e+00   2.236 0.026250 *  
## dmc          1.087e-01  3.181e-02   3.417 0.000737 ***
## dc           1.236e-02  2.257e-03   5.477 1.04e-07 ***
## bui         -1.311e-01  3.380e-02  -3.879 0.000134 ***
## fwi          2.760e-01  8.356e-02   3.302 0.001095 ** 
## hfi          2.107e-04  6.325e-05   3.331 0.000993 ***
## ffmc        -3.667e-03  2.366e-02  -0.155 0.876942    
## isi         -5.540e-01  1.725e-01  -3.212 0.001486 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.154 on 255 degrees of freedom
## Multiple R-squared:  0.4635, Adjusted R-squared:  0.4488 
## F-statistic: 31.47 on 7 and 255 DF,  p-value: < 2.2e-16
# Check for multicollinearity (VIF values above ~10 flag problem predictors)
vif(model_1)
##        dmc         dc        bui        fwi        hfi       ffmc        isi 
##  88.184858   8.172096 155.390803  58.119286   3.557544   5.034248  25.387775
# Remove bui, the predictor with by far the largest VIF (~155)
model_no_bui <- lm(boxcox_fires ~ dmc + dc + fwi + hfi + ffmc + isi, data = weekly_summary)
summary(model_no_bui)
## 
## Call:
## lm(formula = boxcox_fires ~ dmc + dc + fwi + hfi + ffmc + isi, 
##     data = weekly_summary)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3174 -1.6061 -0.0923  1.5856  6.5768 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.620e+00  1.560e+00   4.884 1.83e-06 ***
## dmc         -8.390e-03  1.032e-02  -0.813 0.416854    
## dc           5.504e-03  1.441e-03   3.819 0.000168 ***
## fwi          1.507e-01  7.915e-02   1.904 0.058067 .  
## hfi          2.418e-04  6.444e-05   3.753 0.000216 ***
## ffmc        -4.922e-02  2.110e-02  -2.333 0.020410 *  
## isi         -3.101e-01  1.649e-01  -1.880 0.061247 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.212 on 256 degrees of freedom
## Multiple R-squared:  0.4318, Adjusted R-squared:  0.4185 
## F-statistic: 32.43 on 6 and 256 DF,  p-value: < 2.2e-16
# Re-check multicollinearity after dropping bui
vif(model_no_bui)
##       dmc        dc       fwi       hfi      ffmc       isi 
##  8.793841  3.157935 49.437948  3.500272  3.793957 22.012746
# Remove fwi, now the predictor with the largest VIF (~49)
model_no_fwi <- lm(boxcox_fires ~ dmc + dc + hfi + ffmc + isi, data = weekly_summary)
summary(model_no_fwi)
## 
## Call:
## lm(formula = boxcox_fires ~ dmc + dc + hfi + ffmc + isi, data = weekly_summary)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6039 -1.6627 -0.0623  1.5154  5.6961 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.433e+00  1.438e+00   4.475 1.15e-05 ***
## dmc          6.331e-03  6.865e-03   0.922    0.357    
## dc           7.086e-03  1.183e-03   5.988 7.11e-09 ***
## hfi          2.790e-04  6.173e-05   4.519 9.47e-06 ***
## ffmc        -4.130e-02  2.079e-02  -1.987    0.048 *  
## isi         -3.511e-02  8.004e-02  -0.439    0.661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.223 on 257 degrees of freedom
## Multiple R-squared:  0.4238, Adjusted R-squared:  0.4126 
## F-statistic:  37.8 on 5 and 257 DF,  p-value: < 2.2e-16
# Check for multicollinearity again: all VIFs are now below 10 (worst is
# isi at ~5.1), so collinearity is acceptable; next step drops predictors
# that are not statistically significant.
vif(model_no_fwi)
##      dmc       dc      hfi     ffmc      isi 
## 3.853991 2.107379 3.179478 3.646212 5.131314
# Remove non-significant predictors (dmc, p = 0.357; isi, p = 0.661) and
# keep dc, hfi and ffmc — all significant in the previous fit.
model_final <- lm(boxcox_fires ~ dc + hfi + ffmc, data = weekly_summary)
summary(model_final)
## 
## Call:
## lm(formula = boxcox_fires ~ dc + hfi + ffmc, data = weekly_summary)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8259 -1.6849 -0.0558  1.5553  5.5430 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.171e+00  1.079e+00   5.720 2.93e-08 ***
## dc           7.872e-03  8.840e-04   8.905  < 2e-16 ***
## hfi          2.819e-04  4.518e-05   6.241 1.76e-09 ***
## ffmc        -4.025e-02  1.328e-02  -3.030  0.00269 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.219 on 259 degrees of freedom
## Multiple R-squared:  0.4215, Adjusted R-squared:  0.4148 
## F-statistic:  62.9 on 3 and 259 DF,  p-value: < 2.2e-16
# Check for multicollinearity in the final model: all VIFs are below 2,
# so the retained predictors are effectively uncorrelated.
vif(model_final)
##       dc      hfi     ffmc 
## 1.180492 1.709183 1.494406

Example use of the model

Test 1

# Hypothetical fire-weather scenarios for the three retained predictors:
# row 1 = moderate, row 2 = high, row 3 = extreme conditions.
dc_vals   <- c(200, 400, 600)    # Drought Code
hfi_vals  <- c(1000, 7000, 12000) # Head Fire Intensity
ffmc_vals <- c(50, 80, 95)       # Fine Fuel Moisture Code
test_1 <- data.frame(dc = dc_vals, hfi = hfi_vals, ffmc = ffmc_vals)

# Predicted response on the Box-Cox transformed scale
predicted_boxcox_fires_test_1 <- predict(model_final, newdata = test_1)

# Reverse (invert) the Box-Cox transformation, mapping values on the
# transformed scale back to the original scale.
#
# The trailing "- 1" implies the forward transform was applied to (x + 1),
# i.e. y = ((x + 1)^lambda - 1) / lambda, or log(x + 1) when lambda = 0
# — NOTE(review): confirm against the transformation step earlier in the file.
#
# Args:
#   y      - numeric vector of Box-Cox transformed values
#   lambda - scalar Box-Cox lambda parameter
# Returns: numeric vector of the same length as y on the original scale.
#   Produces NaN where (y * lambda) + 1 is negative.
reverse_boxcox <- function(y, lambda) {
  stopifnot(is.numeric(y), is.numeric(lambda), length(lambda) == 1)
  # lambda is typically estimated numerically (e.g. by MASS::boxcox) and is
  # almost never exactly 0, so compare with a tolerance rather than `==`
  # on a floating-point value; both branches agree as lambda -> 0.
  if (abs(lambda) < 1e-8) {
    exp(y) - 1
  } else {
    ((y * lambda) + 1)^(1 / lambda) - 1
  }
}

# Transform to original scale (number of fires).
# NOTE(review): `lm_best` is presumably the Box-Cox lambda estimated earlier
# (despite the model-like name) — confirm against the transformation step.
predicted_fires_test_1 <- reverse_boxcox(predicted_boxcox_fires_test_1, lm_best)
# Model predictions alongside the input scenarios
result_test_1 <- cbind(test_1, predicted_fires_test_1)
print(result_test_1)
##    dc   hfi ffmc predicted_fires_test_1
## 1 200  1000   50               290.7634
## 2 400  7000   80              1767.9751
## 3 600 12000   95             13139.9558

Test 2

# Sensitivity check: hold dc and ffmc constant and sweep hfi from 2000
# to 10000 to isolate the effect of Head Fire Intensity on predictions.
hfi_grid <- seq(from = 2000, to = 10000, length.out = 5)
n_rows <- length(hfi_grid)
test_2 <- data.frame(
  dc = rep(300, times = n_rows),    # Drought Code held constant
  hfi = hfi_grid,                   # Head Fire Intensity varies
  ffmc = rep(70, times = n_rows)    # Fine Fuel Moisture Code held constant
)

# Predict on the Box-Cox scale, then back-transform to the original
# number-of-fires scale using the previously estimated lambda (lm_best)
predicted_boxcox_fires_test_2 <- predict(model_final, newdata = test_2)
predicted_fires_test_2 <- reverse_boxcox(predicted_boxcox_fires_test_2, lm_best)

# Model predictions alongside the input grid
result_test_2 <- cbind(test_2, predicted_fires_test_2)
print(result_test_2)
##    dc   hfi ffmc predicted_fires_test_2
## 1 300  2000   70               368.0153
## 2 300  4000   70               606.1323
## 3 300  6000   70               992.9600
## 4 300  8000   70              1618.3602
## 5 300 10000   70              2624.7191

Test 3: The 2023 Kelowna fire

# Cluster ids of the Kelowna fire events identified earlier in the analysis
kelowna_cluster_ids <- kelowna_clusters$event_cluster

# Summary table for Kelowna: per event cluster, the hotspot count, the mean
# of each predictor retained in model_final, and the observed date range
kelowna_summary <- hotspots_test %>%
  filter(event_cluster %in% kelowna_cluster_ids) %>%
  group_by(event_cluster, year, month) %>%
  summarize(
    count = n(),
    mean_dc = mean(dc, na.rm = TRUE),
    mean_hfi = mean(hfi, na.rm = TRUE),
    mean_ffmc = mean(ffmc, na.rm = TRUE),
    start_date = min(rep_date),
    end_date = max(rep_date),
    .groups = 'drop'  
  )
print(kelowna_summary)
## # A tibble: 7 × 9
##   event_cluster  year month count mean_dc mean_hfi mean_ffmc start_date         
##           <int> <dbl> <ord> <int>   <dbl>    <dbl>     <dbl> <dttm>             
## 1         11964  2023 Aug     982    866.    3415.      91.7 2023-08-25 12:44:00
## 2         11969  2023 Aug    2153    943.    7212.      93.9 2023-08-16 16:54:00
## 3         13047  2023 Aug     446    889.    3199.      91.8 2023-08-20 16:28:00
## 4         13124  2023 Aug     252    886.    3788.      92.1 2023-08-21 16:09:00
## 5         13384  2023 Aug      15    894.    1240       87.4 2023-08-23 01:51:00
## 6         13402  2023 Aug       4   1017.     146.      83.0 2023-08-23 03:19:00
## 7         13694  2023 Aug       8    943.     249.      83.5 2023-08-25 01:36:00
## # ℹ 1 more variable: end_date <dttm>
# Create a dataframe with Kelowna cluster means as inputs for model_final
test_Kelowna <- data.frame(
  dc = kelowna_summary$mean_dc,     
  hfi = kelowna_summary$mean_hfi, 
  ffmc = kelowna_summary$mean_ffmc        
)
# Predict on the Box-Cox scale, then back-transform using lambda (lm_best)
predicted_boxcox_fires_test_Kelowna <- predict(model_final, newdata = test_Kelowna)

predicted_fires_test_Kelowna <- reverse_boxcox(predicted_boxcox_fires_test_Kelowna, lm_best)

# Result: predicted fire counts per Kelowna event cluster
result_test_Kelowna <- cbind(test_Kelowna, predicted_fires_test_Kelowna)
result_test_Kelowna
##          dc      hfi     ffmc predicted_fires_test_Kelowna
## 1  865.9385 3414.656 91.68305                     11194.53
## 2  942.7162 7212.464 93.90707                     40926.27
## 3  889.2258 3198.713 91.80471                     12338.04
## 4  886.0876 3788.135 92.06285                     13748.79
## 5  894.1928 1240.000 87.39640                      9340.41
## 6 1017.0992  146.250 82.97425                     18643.13
## 7  943.0106  249.375 83.53512                     11592.75
# Average prediction across the seven clusters
mean(result_test_Kelowna$predicted_fires_test_Kelowna, na.rm = TRUE)
## [1] 16826.28

Test 4: The 2023 Kelowna fire, predicted with the full model (model 1)

# Summary table for Kelowna, now including the means of all fire-weather
# indices so the full model (model_1) can be applied.
# NOTE(review): this overwrites the earlier `kelowna_summary`; consider a
# distinct name if both versions are needed later.
kelowna_summary <- hotspots_test %>%
  filter(event_cluster %in% kelowna_cluster_ids) %>%
  group_by(event_cluster, year, month) %>%
  summarize(
    count = n(),
    mean_dc = mean(dc, na.rm = TRUE),
    mean_hfi = mean(hfi, na.rm = TRUE),
    mean_ffmc = mean(ffmc, na.rm = TRUE),
    mean_dmc = mean(dmc, na.rm = TRUE),
    mean_isi = mean(isi, na.rm = TRUE),
    mean_bui = mean(bui, na.rm = TRUE),
    mean_fwi = mean(fwi, na.rm = TRUE),
    start_date = min(rep_date),
    end_date = max(rep_date),
    .groups = 'drop'  
  )
print(kelowna_summary)
## # A tibble: 7 × 13
##   event_cluster  year month count mean_dc mean_hfi mean_ffmc mean_dmc mean_isi
##           <int> <dbl> <ord> <int>   <dbl>    <dbl>     <dbl>    <dbl>    <dbl>
## 1         11964  2023 Aug     982    866.    3415.      91.7     98.8     8.18
## 2         11969  2023 Aug    2153    943.    7212.      93.9    179.     16.0 
## 3         13047  2023 Aug     446    889.    3199.      91.8    166.      7.87
## 4         13124  2023 Aug     252    886.    3788.      92.1    167.      7.94
## 5         13384  2023 Aug      15    894.    1240       87.4    166.      5.21
## 6         13402  2023 Aug       4   1017.     146.      83.0    187.      2.69
## 7         13694  2023 Aug       8    943.     249.      83.5    112.      3.05
## # ℹ 4 more variables: mean_bui <dbl>, mean_fwi <dbl>, start_date <dttm>,
## #   end_date <dttm>
# Create a dataframe with Kelowna data covering all seven predictors so the
# full model can be evaluated on the same clusters
test_Kelowna_2 <- data.frame(
  dc = kelowna_summary$mean_dc,     
  hfi = kelowna_summary$mean_hfi, 
  ffmc = kelowna_summary$mean_ffmc,
  dmc = kelowna_summary$mean_dmc,
  isi = kelowna_summary$mean_isi,
  bui = kelowna_summary$mean_bui,
  fwi = kelowna_summary$mean_fwi
)
# NOTE(review): `model_1` is presumably the initial full model fitted earlier
# (not visible in this section) — confirm its formula matches these columns.
predicted_boxcox_fires_test_Kelowna_2 <- predict(model_1, newdata = test_Kelowna_2)

# Back-transform to the original scale using the Box-Cox lambda (lm_best)
predicted_fires_test_Kelowna_2 <- reverse_boxcox(predicted_boxcox_fires_test_Kelowna_2, lm_best)

# Result: full-model predictions per Kelowna event cluster
result_test_Kelowna_2 <- cbind(test_Kelowna_2, predicted_fires_test_Kelowna_2)
result_test_Kelowna_2
##          dc      hfi     ffmc       dmc       isi      bui      fwi
## 1  865.9385 3414.656 91.68305  98.82007  8.181949 153.6726 32.15215
## 2  942.7162 7212.464 93.90707 179.38759 16.005370 243.0274 52.63774
## 3  889.2258 3198.713 91.80471 166.09449  7.865226 226.4096 33.58339
## 4  886.0876 3788.135 92.06285 166.91505  7.942167 226.9296 33.81845
## 5  894.1928 1240.000 87.39640 165.54847  5.211667 226.2989 25.38913
## 6 1017.0992  146.250 82.97425 187.42925  2.691250 256.6035 15.87200
## 7  943.0106  249.375 83.53512 112.02188  3.051125 172.6675 16.55725
##   predicted_fires_test_Kelowna_2
## 1                      9104.7784
## 2                      9910.6061
## 3                      2773.5525
## 4                      3092.0990
## 5                      1011.5985
## 6                       264.3268
## 7                      1409.2541
# Average prediction across the seven clusters
mean(result_test_Kelowna_2$predicted_fires_test_Kelowna_2, na.rm = TRUE)
## [1] 3938.031